# -*- coding:utf-8 -*-

import requests
from lxml import etree


class Ksw():
    """Scraper for the 00ksw.com novel website.

    The class keeps no per-instance state; every method works from the
    URL it is given.
    """

    # Substring that marks a paragraph as an advertisement.
    AD_MARKER = "mimiread"

    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests.get`` can block indefinitely on a stalled connection.
    timeout = 10

    def request(self, url):
        """Fetch *url* and return the response body decoded as GBK.

        Args:
            url: Address of the page to download.

        Returns:
            The page text, decoded with the ``gbk`` codec.

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the server does not answer within ``self.timeout``.
        """
        response = requests.get(url, timeout=self.timeout)
        # The encoding is forced to GBK rather than taken from the response.
        response.encoding = 'gbk'
        return response.text

    def get_list(self, url):
        """Return the chapter links found on the index page at *url*.

        Each link is built by appending the anchor's ``href`` to *url*
        as-is (plain string concatenation, no URL resolution).

        Args:
            url: Address of the index page.

        Returns:
            A list of link strings, in document order.
        """
        page = etree.HTML(self.request(url))
        anchors = page.xpath(
            "/html/body/div/div[3]/div[2]/div[1]/div[3]/ul/li/a")
        return ['%s%s' % (url, anchor.attrib['href']) for anchor in anchors]

    def get_story(self, url):
        """Download one chapter page and extract its title and text.

        Args:
            url: Address of the chapter page.

        Returns:
            A dict with two keys: ``"title"`` (text of the first matching
            heading, or ``None`` when the heading is missing) and
            ``"content"`` (the non-advertisement paragraphs joined with
            ``"\\r\\n"``).
        """
        page = etree.HTML(self.request(url))
        headings = page.xpath('// *[ @ id = "nr_content"] / div[2] / h3')
        paragraphs = page.xpath('//*[@id="articlecontent"]/p')

        return {
            # A page without the expected heading yields None instead of
            # an IndexError.
            "title": headings[0].text if headings else None,
            "content": self._build_content(p.text for p in paragraphs),
        }

    @staticmethod
    def _build_content(texts):
        """Join paragraph texts with CRLF, dropping ads and missing text.

        Args:
            texts: Iterable of paragraph texts; items may be ``None``
                (lxml reports ``None`` for an element with no leading
                text).

        Returns:
            The kept paragraphs joined with ``"\\r\\n"``.
        """
        kept = [
            text for text in texts
            # None would break both the substring test and the join.
            if text is not None and Ksw.AD_MARKER not in text
        ]
        return '\r\n'.join(kept)

    def delete_ad(self, ad):
        """Placeholder for regex-based ad removal; currently does nothing.

        Returns:
            None.
        """
        # TODO: earlier draft of the intended substitution, kept for reference:
        # re.sub(r'([.\u54aa\u9605\u8bfba-zA-Z\u3010-\u3011\\])','','\r\n'.join())
        pass
